In [1]:
# Author: Stephen Situ
# This is an exercise for practicing the ETL (Extract-Transform-Load) process for data integration. We have 
# CSV, json, and xml files in our directory. By defining a function for extracting each type, and then running a
# final extract function that loops through all the files, we can create a pandas dataframe that holds the 
# combined data. Additionally, we create a timestamped logfile to track these processes. 
In [ ]:
# Libraries
import glob                         # For selecting files in directory
import pandas as pd                 # For reading csv files
import xml.etree.ElementTree as ET  # For reading xml files
from datetime import datetime       # For Getting date/time
In [2]:
# Set Paths
tmpfile    = "dealership_temp.tmp"               # temp file used to store all extracted data
logfile    = "dealership_logfile.txt"            # Log file
targetfile = "dealership_transformed_data.csv"   # CSV file to store all all data
In [3]:
# Define function to read CSV and return the dataframe
def extract_from_csv(file_to_process):
    dataframe = pd.read_csv(file_to_process)
    return dataframe
In [4]:
# Define function to read CSV and return dataframe
def extract_from_json(file_to_process):
    dataframe = pd.read_json(file_to_process,lines=True)
    return dataframe
In [5]:
# Definte function to read xml file and append values to dataframe
def extract_from_xml(file_to_process):
    dataframe = pd.DataFrame(columns=["car_model", "year_of_manufacture", "price", "fuel"])
    tree = ET.parse(file_to_process)
    root = tree.getroot()
    for row in root:
        car_model = row.find("car_model").text
        year_of_manufacture = row.find("year_of_manufacture").text
        price = float(row.find("price").text)
        fuel = row.find("fuel").text
        dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True)
        return dataframe
In [6]:
# Define function to 
def extract():
    
    extracted_data = pd.DataFrame(columns=['car_model','year_of_manufacture','price','fuel']) # create an empty data frame to hold extracted data
    
    # Loop through all csv files in directory, * indicates all files
    for csvfile in glob.glob("*.csv"):
        extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)
        
    # Loop through all json files in directroy, * star indicates all files
    for jsonfile in glob.glob("*.json"):
        extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)
    
    # Loop through all xml files in directory, * star indicates all files
    for xmlfile in glob.glob("*.xml"):
        extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)
        
    return extracted_data
In [7]:
# Define transformation function to round price to 2 digits
def transform(data):# Add the transform function below
    data['price'] = round(data.price,2)
    return data
In [8]:
# Define load functions to write into CSV file
def load(targetfile,data_to_load):
    data_to_load.to_csv(targetfile)  
In [9]:
# Define log function to write timesteamp in to Log file
# with open, opens logfile.txt and appends
def log(message):
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second
    now = datetime.now() # get current timestamp
    timestamp = now.strftime(timestamp_format)
    with open("logfile.txt","a") as f:
        f.write(timestamp + ',' + message + '\n')
In [10]:
# Start ETL Log
log("ETL Job Started")
In [11]:
# Extract Phase
log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
extracted_data
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:8: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:8: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:8: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:12: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:12: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:12: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\3203211828.py:11: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:16: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\3203211828.py:11: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:16: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\3203211828.py:11: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True)
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:16: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)
Out[11]:
car_model year_of_manufacture price fuel
0 ritz 2014 5000.000000 Petrol
1 sx4 2013 7089.552239 Diesel
2 ciaz 2017 10820.895522 Petrol
3 wagon r 2011 4253.731343 Petrol
4 swift 2014 6865.671642 Diesel
... ... ... ... ...
58 etios g 2014 6119.402985 Petrol
59 fortuner 2014 29835.820896 Diesel
60 corolla altis 2013 10373.134328 Petrol
61 etios liva 2014 5895.522388 Diesel
62 corolla altis 2016 21985.074627 Diesel

63 rows × 4 columns

In [12]:
# Transform phase
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
transformed_data 
Out[12]:
car_model year_of_manufacture price fuel
0 ritz 2014 5000.00 Petrol
1 sx4 2013 7089.55 Diesel
2 ciaz 2017 10820.90 Petrol
3 wagon r 2011 4253.73 Petrol
4 swift 2014 6865.67 Diesel
... ... ... ... ...
58 etios g 2014 6119.40 Petrol
59 fortuner 2014 29835.82 Diesel
60 corolla altis 2013 10373.13 Petrol
61 etios liva 2014 5895.52 Diesel
62 corolla altis 2016 21985.07 Diesel

63 rows × 4 columns

In [13]:
# load phase
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")
In [14]:
# End ETL
log("ETL Job Ended")
In [17]:
# Check Dataframe contents
Finished_df = pd.read_csv('dealership_transformed_data.csv')
Finished_df
Out[17]:
Unnamed: 0 car_model year_of_manufacture price fuel
0 0 ritz 2014 5000.00 Petrol
1 1 sx4 2013 7089.55 Diesel
2 2 ciaz 2017 10820.90 Petrol
3 3 wagon r 2011 4253.73 Petrol
4 4 swift 2014 6865.67 Diesel
... ... ... ... ... ...
58 58 etios g 2014 6119.40 Petrol
59 59 fortuner 2014 29835.82 Diesel
60 60 corolla altis 2013 10373.13 Petrol
61 61 etios liva 2014 5895.52 Diesel
62 62 corolla altis 2016 21985.07 Diesel

63 rows × 5 columns

In [18]:
# Check Logfile
with open('logfile.txt') as f:
    contents = f.read()
    print(contents)
2022-Nov-28-17:26:59,ETL Job Started
2022-Nov-28-17:27:01,Extract phase Started
2022-Nov-28-17:27:01,Extract phase Ended
2022-Nov-28-17:27:04,Transform phase Started
2022-Nov-28-17:27:04,Transform phase Ended
2022-Nov-28-17:27:06,Load phase Started
2022-Nov-28-17:27:06,Load phase Ended
2022-Nov-28-17:27:08,ETL Job Ended